1.- Exploration of the distribution of species abundance and lengths in the raw libraries

Raw libraries

# abundance of sRNA species
### Select only for 21,22 and 24
species_srna = function(path, sample_name, sex_id, tissue_id) {
  table = data.table(read.table(path))
  counts = as.data.frame(table[, .N, by='V2']) %>% mutate(sample = sample_name, sex = sex_id, tissue = tissue_id)
  abundance = as.data.frame(table[, .N, by='V1']) %>% mutate(sample = sample_name, sex = sex_id, tissue = tissue_id) %>% filter(N > 2)
  list(table, counts, abundance)
}

### FEMALE SAMPLES
f1l=species_srna("raw/sRNA_MGX/trimmed/F1L_final_trimming.fastq_count.tsv", "F1L", "Female", "Leaf")
f1b=species_srna("raw/sRNA_MGX/trimmed/F1B_final_trimming.fastq_count.tsv", "F1B", "Female", "Flower bud")

f2l=species_srna("raw/sRNA_MGX/trimmed/F2L_final_trimming.fastq_count.tsv", "F2L", "Female", "Leaf")
f2b=species_srna("raw/sRNA_MGX/trimmed/F2B_final_trimming.fastq_count.tsv", "F2B", "Female", "Flower bud")

f3l=species_srna("raw/sRNA_MGX/trimmed/F3L_final_trimming.fastq_count.tsv", "F3L", "Female", "Leaf")
f3b=species_srna("raw/sRNA_MGX/trimmed/F3B_final_trimming.fastq_count.tsv", "F3B", "Female", "Flower bud")

### MALE SAMPLES

m1l=species_srna("raw/sRNA_MGX/trimmed/M1L_final_trimming.fastq_count.tsv", "M1L", "Male", "Leaf")
m1b=species_srna("raw/sRNA_MGX/trimmed/M1B_final_trimming.fastq_count.tsv", "M1B", "Male", "Flower bud")

m2l=species_srna("raw/sRNA_MGX/trimmed/M2L_final_trimming.fastq_count.tsv", "M2L", "Male", "Leaf")
m2b=species_srna("raw/sRNA_MGX/trimmed/M2B_final_trimming.fastq_count.tsv", "M2B", "Male", "Flower bud")

m4l=species_srna("raw/sRNA_MGX/trimmed/M4L_final_trimming.fastq_count.tsv", "M4L", "Male", "Leaf")
m4b=species_srna("raw/sRNA_MGX/trimmed/M4B_final_trimming.fastq_count.tsv", "M4B", "Male", "Flower bud")
### Species abundance

spp = do.call(rbind, lapply(list(f1l[[3]],
             f1b[[3]],
             f2l[[3]],
             f2b[[3]],
             f3l[[3]],
             f3b[[3]],
             m1l[[3]],
             m1b[[3]],
             m2l[[3]],
             m2b[[3]],
             m4l[[3]],
             m4b[[3]]), as.data.frame
            ) )

rm(f1b, f1l, f2b, f2l, f3b, f3l, m1b, m1l, m2b, m2l, m4b, m4l)

colnames(spp) = c("sRNA", "counts", "sample", "sex", "tissue")

spp = spp %>% mutate(log_counts = log2(counts))

#### Histogram 

ggplot(spp) +
  geom_density(aes(x=log_counts, color=sample), alpha=0.6) + 
  facet_grid(tissue ~ sex) +
  scale_color_manual(values = met.brewer("Hiroshige", 12)) +
  labs(y="Density", x="log2(Expression)")

ggsave("results/sp_expression.png", width = 6, height = 5, dpi = 600)
### With 320880141 total reads and mincov of 1 reads per million, the min read depth is 321
### For RPM > 1
no_uniq = spp[spp$counts>321, ]
ggplot(no_uniq) +
  geom_density(aes(x=log2(counts), color=sample), alpha=0.6) + 
  facet_grid(tissue ~ sex) +
  scale_color_manual(values = met.brewer("Hiroshige", 12)) +
  labs(y="Density", x="log2(Expression)")

ggsave("results/reduced_sp_expression.png", width = 6, height = 5, dpi = 600)
lgt = do.call(rbind, lapply(list(f1l[[2]],
             f1b[[2]],
             f2l[[2]],
             f2b[[2]],
             f3l[[2]],
             f3b[[2]],
             m1l[[2]],
             m1b[[2]],
             m2l[[2]],
             m2b[[2]],
             m4l[[2]],
             m4b[[2]]), as.data.frame
            ) )

ggplot(lgt) +
  geom_bar(aes(x=as.factor(V2), y=as.numeric(N), fill=sample), stat="identity", position = "dodge") +
  facet_grid(tissue ~ sex) +
  scale_fill_manual(values = met.brewer("Hiroshige", 12)) +
  labs(x="sRNA Length", y="Count")

ggsave("results/length.png", width = 6, height = 5, dpi = 600)

After the alignment and quantification with ShortStack

fracs = read.table("raw/ShortStack_results/alignment_details.tsv", header = T)
fracs = filter(fracs, mapping_type == "P", count > 0)  
fracs = cbind(data.frame(
  sample = rep(c(rep(1,6),rep(2,6), rep(3,6)), 2),
  sex = c(rep("Female", 18), rep("Male", 18)),
  organ = rep(c(rep("Flower bud", 3), rep("Leaf", 3)), 6)
) , fracs)
fracs = fracs %>% select(sample, sex, organ, read_length, count)

ggplot(fracs) + 
  geom_bar(aes(fill=read_length, y=count, x=sample), position="fill", stat="identity") +
  facet_grid(organ ~ sex) +
  scale_fill_manual(values = met.brewer("Hokusai3")) +
  labs(y="Percentage of aligned reads", x="Individual", fill="sRNA length") +
  theme(text = element_text(size=20))

ggsave("results/fractions.png", width = 6, height = 6, dpi = 600)

rm(fracs)
# set boundaries
chr_sizes = read.table("chromSizes.txt")
colnames(chr_sizes) = c("chr", "end")
chr_sizes = chr_sizes %>% mutate(start = "1") %>% relocate(start, .before = end)

# get density of PTGS clusters
ptgs_bed = read.table("raw/only_21-22_known_de_novo_f-y/Results.gff3")[, c(1, 4:5)]
colnames(ptgs_bed) = c("chr", "start", "end")

# get density of PTGS clusters
rddm_bed = read.table("raw/only_24_f-y/Results.gff3")[, c(1, 4:5)]
colnames(rddm_bed) = c("chr", "start", "end")

# load gene annotation
gene_bed = read.table("annotation/genes.bed")
#repeats_bed = read.table("annotation/repeats.bed", sep = "\t")
tes_bed =  read.table("annotation/tes.bed", sep = "\t")
tesorter_bed = read.table("tesorter_genome/tes_genome.dom.bed")

# plot densities
{
png(file="results/srna_density.png", width=2000, height=2000)
circos.par("clock.wise"=TRUE, start.degree=86, gap.degree=c(rep(1,18), 8))
circos.genomicInitialize(chr_sizes, plotType = c("axis"), axis.labels.cex=2)
circos.track(ylim = c(0, 0.05), bg.border = NA, bg.col = c(viridis::inferno(11, begin=0.15, end=0.9, direction = -1), "gray", rep("black", 8)) , track.height = 0.05)
circos.genomicDensity(gene_bed, col = c("#ee4d80"), track.height = 0.1, count_by = "number")
circos.genomicDensity(tes_bed, col = c("#b52ac6"), track.height = 0.1, count_by = "number")
circos.genomicDensity(ptgs_bed, col = c("#5EBF75"), track.height = 0.1, count_by = "number")
circos.genomicDensity(rddm_bed, col = c("#6276F6"), track.height = 0.1, count_by = "number")
circos.clear()
dev.off()
}

# with TEsortert annotations (smaller file)
{
png(file="results/densities.png", width=2000, height=2000)
circos.par("clock.wise"=TRUE, start.degree=86, gap.degree=c(rep(1,18), 8))
circos.genomicInitialize(chr_sizes, plotType = c("axis"), axis.labels.cex=2)
circos.track(ylim = c(0, 0.05), bg.border = NA, bg.col = c(viridis::inferno(11, begin=0.15, end=0.9, direction = -1), "gray", rep("black", 8)) , track.height = 0.05)
circos.genomicDensity(gene_bed, col = c("#ee4d80"), track.height = 0.1, count_by = "number")
circos.genomicDensity(tesorter_bed, col = c("#b52ac6"), track.height = 0.1, count_by = "number")
circos.genomicDensity(ptgs_bed, col = c("#5EBF75"), track.height = 0.1, count_by = "number")
circos.genomicDensity(rddm_bed, col = c("#6276F6"), track.height = 0.1, count_by = "number")
circos.clear()
dev.off()
}

#### Circos plot of sRNA mapping per 1mb windows

# create function
surco = function(toy, lib_size_1, lib_size_2, lib_size_3) {
          # set column names
          colnames(toy) = c("chr", "start", "end", "depth_1", "depth_2", "depth_3", "chr_w", "start_w", "end_w", "wind")
          # calculate mead depth per window, normalize by library size
          toy[, mean_depth_1 := mean(depth_1)/lib_size_1, by=.(wind)]
          toy[, mean_depth_2 := mean(depth_2)/lib_size_2, by=.(wind)]
          toy[, mean_depth_3 := mean(depth_3)/lib_size_3, by=.(wind)]
          # get one row per window
          toy = unique(toy, by = "wind")[, .(chr_w, start_w, end_w, wind, mean_depth_1, mean_depth_2, mean_depth_3)]
          # overall mean and multiply by 1m
          toy[, mean_depth := (mean_depth_1 + mean_depth_2 + mean_depth_3)*(1000000/3), by=1:nrow(toy)]
          toy[, log_depth := log(mean_depth), by=1:nrow(toy)]
          # print bed file + mean depth per million
          toy[, .(chr_w, start_w, end_w, mean_depth, log_depth)]
}

# load files and do computations (library sizes are in the jupyter notebook)

females_depth_ptgs = fread("raw/depth/females_flowers_ptgs.depth")
females_depth_ptgs = surco(females_depth_ptgs, 54935002, 29689313, 25245689)

females_depth_rddm = fread("raw/depth/females_flowers_rddm.depth")
females_depth_rddm = surco(females_depth_rddm, 54935002, 29689313, 25245689)

males_depth_ptgs = fread("raw/depth/males_flowers_ptgs.depth")
males_depth_ptgs = surco(males_depth_ptgs, 13224600, 23943681, 23582102)

males_depth_rddm = fread("raw/depth/males_flowers_rddm.depth")
males_depth_rddm = surco(males_depth_rddm, 13224600, 23943681, 23582102)


# plot circos
{
png(file="results/srna_depth.png", width=2000, height=2000)
circos.par("clock.wise"=TRUE, start.degree=86, gap.degree=c(rep(1,18), 8))
circos.genomicInitialize(chr_sizes, plotType = c("axis"), axis.labels.cex=2)
circos.track(ylim = c(0, 0.05), bg.border = NA, bg.col = c(viridis::inferno(11, begin=0.15, end=0.9, direction = -1), "gray", rep("black", 8)) , track.height = 0.05)
circos.genomicDensity(gene_bed, col = c("#ee4d80"), track.height = 0.1, count_by = "number")
circos.genomicDensity(tes_bed, col = c("#b52ac6"), track.height = 0.1, count_by = "number")
circos.genomicHeatmap(males_depth_ptgs, na_col= "white", numeric.column="log_depth", heatmap_height = 0.05, connection_height=NULL, col= colorRamp2(c(-4, 5), c("white", "#2f8d45")))
circos.genomicHeatmap(males_depth_rddm, na_col= "white", numeric.column="log_depth", heatmap_height = 0.05, connection_height=NULL, col= colorRamp2(c(-4, 5), c("white", "#4453b1")))
circos.genomicHeatmap(females_depth_ptgs, na_col= "white", numeric.column="log_depth", heatmap_height = 0.05, connection_height=NULL, col= colorRamp2(c(-4, 5), c("white", "#2f8d45")))
circos.genomicHeatmap(females_depth_rddm, na_col= "white", numeric.column="log_depth", heatmap_height = 0.05, connection_height=NULL, col= colorRamp2(c(-4, 5), c("white", "#4453b1")))
circos.clear()
dev.off()
}

# Distribution of inverted repeats
inv_bed = read.table("results/inv_rep.bed"
                     
{
png(file="results/inv_rep.png", width=2000, height=2000)
circos.par("clock.wise"=TRUE, start.degree=86, gap.degree=c(rep(1,18), 8))
circos.genomicInitialize(chr_sizes, plotType = c("axis"), axis.labels.cex=2)
circos.track(ylim = c(0, 0.05), bg.border = NA, bg.col = c(viridis::inferno(11, begin=0.15, end=0.9, direction = -1), "gray", rep("black", 8)) , track.height = 0.05)
circos.genomicDensity(gene_bed, col = c("#ee4d80"), track.height = 0.1, count_by = "number")
circos.genomicDensity(tes_bed, col = c("#b52ac6"), track.height = 0.1, count_by = "percent")
circos.genomicDensity(inv_bed, col = c("blue"), track.height = 0.1, count_by = "number")
circos.clear()
dev.off()
}

2.- Quantification analysis and differential expression of 21-22 sNRAs

PTGS

### All together
exp_ptgs = read.table("raw/only_21-22_known_de_novo_f-y/Counts.txt", header = T)

transposed = t(cpm(exp_ptgs[, 4:15], log = T))

colnames(transposed) = exp_ptgs$Name
samples = data.frame(samples = rownames(transposed), 
                     tissue = c(rep(c("Female, Flower bud", "Female, Leaf"), 3), rep(c("Male, Flower bud", "Male, Leaf"), 3)),
                     sex = c(rep("Female", 6), rep("Male", 6)),
                     organ = rep(c("Flower bud", "Leaf"), 6)
                       )

pca_exp = PCA(transposed, ncp = 20, graph = F)

pcadf = data.frame(samples, 
                   pca1 = as.data.frame(pca_exp$ind)[,1],
                   pca2 = as.data.frame(pca_exp$ind)[,2])

ggplot(pcadf) +
  geom_hline(yintercept = 0, color="gray40")+
  geom_vline(xintercept = 0, color="gray40")+
  geom_jitter(aes(pca1, pca2, fill=tissue), shape=21, size=8) +
  labs(x="Component 1 (35.67%)", y= "Component 2 (13.39%)", fill= "Tissue") +
  scale_fill_manual(values = c("#3ab98f", "#2ab1c6", "#ee4d80", "#b52ac6")) +
  guides(fill = guide_legend(override.aes = list(size=8))) +
  coord_fixed(ratio = 1.5) +
  theme_bw(base_size = 20) +
  theme(legend.position = c(0.8, 0.7),
        legend.text = element_text(size = 10),
        legend.title = element_text(size = 10),
        legend.background = element_rect(fill = "gray"),
        aspect.ratio=1)

ggsave("results/pca.png", width = 8, height = 6, dpi = 600)

rm(pcadf, pca_exp)
# sRNA clusters names will be kept as row names, remove columns with ids
matrix_ptgs = exp_ptgs[, 4:15]
row.names(matrix_ptgs) = exp_ptgs$Name

# discard low counts (<0.5 RPM) entries for at least 2 samples
keep = rowSums(cpm(matrix_ptgs)>0.5) >= 2   
summary(keep) # 132 will be removed 
##    Mode   FALSE    TRUE 
## logical     132   45260
matrix_ptgs = matrix_ptgs[keep,]
rm(keep)

# Prepare data info
datainfo = samples[, 2:4]
row.names(datainfo) = samples$samples # colnames in the expression matrix must match 
pyramide = function(raw_matrix, data_info, design_formula, groups_vector){

######### DESEQ 2

# Generate the DESeqDataSet
deseq_object = DESeqDataSetFromMatrix(countData = raw_matrix, colData = data_info, design = design_formula)

# Run Analysis
deseq_object = DESeq(deseq_object)

# Get table
deseq_table = as.data.frame(results(deseq_object, pAdjustMethod = "BH"))

# Get DE clusters that fall under the selection threshold (pvalue < 0.001)
deseq = rownames(filter(deseq_table, pvalue<0.001 & {log2FoldChange>1 | log2FoldChange<(-1)} ) )

######## EDGER

# generate DGEList with grouping by species 
edgeR.DGElist <- DGEList(counts = raw_matrix, group = groups_vector)

# normalize for RNA composition, scaling for the effective library size
edgeR.DGElist <- calcNormFactors(edgeR.DGElist)

# Estimation of  dispersion
edgeR.DGElist <- estimateCommonDisp(edgeR.DGElist, verbose=T)
edgeR.DGElist <- estimateTagwiseDisp(edgeR.DGElist)

# ESTIMATION OF DISPERTION by GLM
design.mat <- model.matrix(~ groups_vector)

# perform DE testing
fit <- glmFit(edgeR.DGElist, design.mat)
lrt <- glmLRT(fit , coef = 2)

# extract  results table
edger_table = topTags(lrt , n = Inf, sort.by = "PValue", adjust.method = "BH")$table

# Extract most differential expressed genes
edger = rownames(filter(edger_table, PValue<0.001 & {logFC>1 | logFC<(-1)} ))

######### LIMA VOOM 

# Redefine object
LimmaVoom.DGElist <- edgeR.DGElist

# normalize for RNA composition, scaling for the effective library size
LimmaVoom.DGElist <- calcNormFactors(LimmaVoom.DGElist)
rownames(design.mat) <- colnames(LimmaVoom.DGElist) # add names to the design matrix from edgeR

# Transform count data to log2-counts per million (logCPM), estimate the mean-variance relationship and 
# use this to compute appropriate observational-level weights. The data are then ready for linear modelling
voomTransformed  <- voom(LimmaVoom.DGElist , design.mat , plot=F)

# fit a linear model for each gene
voomed.fitted  <- lmFit(voomTransformed , design = design.mat)

# compute  statistics
voomed.fitted  <- eBayes(voomed.fitted)

# extract most differential expressed genes and table
lvoom_table = topTable(voomed.fitted, adjust="BH", number = Inf, sort.by="P")
lvoom = rownames(filter(lvoom_table, P.Value<0.001 & {logFC>1 | logFC<(-1)} ))

######## Get the clusters that were diferentially expressed in at least 2 methods
clusters = unique(c(intersect(deseq, edger), intersect(deseq, lvoom), intersect(edger, lvoom)))

####### Print results
list("clusters" = clusters,
     "DeSeq2 table" = deseq_table,
     "EdgeR table" = edger_table,
     "Lima-Voom table" = lvoom_table)
}

Differential Expression for sex-biased/specific siRNAs (21-22nt) in Flowers

# Select samples
fptgs = matrix_ptgs %>% select(grep('B', colnames(matrix_ptgs)))
fptgs_info = datainfo %>% filter(organ=="Flower bud")
# define groups (in the same order as the table)
fgroups = datainfo[datainfo$organ == "Flower bud", ]$sex

flowers = pyramide(fptgs, fptgs_info, ~sex, fgroups)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## Disp = 0.72015 , BCV = 0.8486
## Removing intercept from test coefficients
rm(fptgs, fptgs_info, fgroups)

Differential Expression for sex-biased/specific siRNAs (21-22nt) in Leaves

# Select samples
lptgs = matrix_ptgs %>% select(grep('L', colnames(matrix_ptgs)))
lptgs_info = datainfo %>% filter(organ=="Leaf")
# define groups (in the same order as the table)
lgroups = datainfo[datainfo$organ == "Leaf", ]$sex

leaves = pyramide(lptgs, lptgs_info, ~sex, lgroups)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## Disp = 0.7999 , BCV = 0.8944
## Removing intercept from test coefficients
rm(lptgs, lptgs_info, lgroups)

Differential Expression for tissue-biased/specific siRNAs (21-22nt) in Females

# Select samples
feptgs = matrix_ptgs %>% select(grep('F', colnames(matrix_ptgs)))
feptgs_info = datainfo %>% filter(sex=="Female")
# define groups (in the same order as the table)
fegroups = datainfo[datainfo$sex == "Female", ]$organ

female = pyramide(feptgs, feptgs_info, ~organ, fegroups)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
## Disp = 0.66011 , BCV = 0.8125
## Removing intercept from test coefficients
rm(feptgs, feptgs_info, fegroups)

Differential Expression for tissue-biased/specific siRNAs (21-22nt) in Males

# Select samples
maptgs = matrix_ptgs %>% select(grep('M', colnames(matrix_ptgs)))
maptgs_info = datainfo %>% filter(sex=="Male")
# define groups (in the same order as the table)
magroups = datainfo[datainfo$sex == "Male", ]$organ

male = pyramide(maptgs, maptgs_info, ~organ, magroups)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
## Disp = 0.85829 , BCV = 0.9264
## Removing intercept from test coefficients
rm(maptgs, maptgs_info, magroups)

Further analysis

ptgs_list = list("Sex-biased in flower buds" = flowers[["clusters"]], 
                  "Sex-biased in leaves" = leaves[["clusters"]], 
                  "Tissue-biased in females" = female[["clusters"]], 
                  "Tissue-biased in males" = male[["clusters"]])

upset(fromList(ptgs_list),
      nintersects = 8,
      mainbar.y.label = "Shared DE sRNA clusters",
      sets.x.label = "DE sRNA clusters", 
      matrix.color = met.brewer("Egypt")[2],
      main.bar.color = met.brewer("Egypt")[3],
      sets.bar.color = met.brewer("Egypt")[4],
      shade.color = met.brewer("Egypt")[2],
      point.size = 10,
      line.size = 4,
      text.scale = c(3, 4, 3, 4, 5, 4),
      mb.ratio = c(0.5, 0.5)
      )

rm(ptgs_list)
# Sex-biased in flowers
baits_flowers = flowers[["clusters"]]
baits_flowers = as.data.frame(cpm(matrix_ptgs, log=TRUE)[baits_flowers, ] ) %>% select(grep('B', colnames(matrix_ptgs))) 
baits_flowers = baits_flowers %>% mutate(cluster = rownames(baits_flowers)) %>% group_by(cluster) %>% summarise(female_exp = mean(c(F1B_dicer, F2B_dicer, F3B_dicer)), 
                                                                                                      male_exp = mean(c(M1B_dicer, M2B_dicer, M4B_dicer)), 
                                                                                                      male_biased = male_exp>female_exp,
                                                                                                      type = "Sex-biased sRNAs in flowers") %>% add_count(type)
# Sex-biased in leaves
baits_leaves = leaves[["clusters"]]
baits_leaves = as.data.frame(cpm(matrix_ptgs, log=TRUE)[baits_leaves, ] ) %>% select(grep('L', colnames(matrix_ptgs))) 
baits_leaves = baits_leaves %>% mutate(cluster = rownames(baits_leaves)) %>% group_by(cluster) %>% summarise(female_exp = mean(c(F1L_dicer, F2L_dicer, F3L_dicer)), 
                                                                                                   male_exp = mean(c(M1L_dicer, M2L_dicer, M4L_dicer)), 
                                                                                                   male_biased = male_exp>female_exp,
                                                                                                   type = "Sex-biased sRNAs in leaves") %>% add_count(type)


bait_df = rbind(baits_flowers, baits_leaves)

# Get mir annotation
anno_ptgs = read.table("raw/only_21-22_known_de_novo_f-y/Results.txt", header = T)
mirnas_ptgs = anno_ptgs[, c("Name", "MIRNA")]

# Merge annotation
bait_df = bait_df %>% left_join(mirnas_ptgs, by = c("cluster"="Name"))
bait_df = bait_df %>% arrange(MIRNA)

levels(as.factor(bait_df$MIRNA)) # 3 miRNA
## [1] "N" "Y"
write.table(bait_df, file ="results/DEsRNAClusters.tsv",  quote = F, col.names = F, sep = "\t", row.names = F)

# Plot all
ggplot(bait_df) +
  annotate(geom = "polygon", x = c(-Inf, Inf, Inf), y = c(-Inf, Inf, -Inf), fill = "gray8", alpha = 0.2 ) +
  geom_abline(intercept = 0, slope = 1, alpha=0.6, color="red2", size=1, linetype="dashed") +
  geom_jitter(aes(female_exp, male_exp, fill=male_biased, shape=MIRNA, color=MIRNA), size=4) +
  geom_text(aes(x = -Inf, y = -Inf, label= paste0("n = ", n)), hjust = -.5, vjust   = -28) +
  scale_x_continuous(limits = c(-2.5, 10)) +
  scale_y_continuous(limits = c(-2.5, 10)) +
  labs(x= "♀ logRPM", y="♂ logRPM") +
  scale_fill_manual(values = met.brewer("Derain")) +
  scale_shape_manual(values = c(21, 19)) +
  scale_color_manual(values = c("black", "#2471a3")) +
  theme_bw() +
  theme(legend.position = "none",
        text = element_text(size = 20),
        panel.border = element_rect(color = "black", fill=NA),
        aspect.ratio=1) +
  facet_grid(~type)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 1908 rows containing missing values (`geom_point()`).

ggsave("results/desrnas.png", width = 8, height = 8, dpi = 600)
## Warning: Removed 1908 rows containing missing values (`geom_point()`).
rm(baits_flowers, baits_leaves, mirnas_ptgs)

3.- Differential expression of genes

# clean and transform data
kall = read.table(file = "rna-seq/kallisto/raw_counts.tsv", header = T)
rownames(kall) = kall$target_id
kall = kall[, -1]

# Round counts cause Kallisto returns decimals after the bootstrapping
kall = round(kall)

# discard low counts (<0.5 RPM) entries for at least 2 samples
keep = rowSums(cpm(kall)>0.5) >= 2  
summary(keep) # 590 genes will be removed 
##    Mode   FALSE    TRUE 
## logical     590   12481
kall = kall[keep,]
rm(keep)

# Prepare data info
# Four female plants (C1_26, C1_27, C1_29, C1_34) and four males (C1_01, C1_03, C1_04, C1_05) were sequenced for flower buds ("_B_" in names) and leaves ("_L_" in names). 

kall_info = data.frame(sex = c(rep("Male", 8), rep("Female", 8)),
                       organ = rep(c("Flower bud", "Leaf"), 8),
                       row.names = colnames(kall))

# check all samples are included and in the same order

all(colnames(kall) %in% rownames(kall_info))
## [1] TRUE
all(colnames(kall) == rownames(kall_info))
## [1] TRUE
# Read TPM data and transform
tpm = read.table("rna-seq/kallisto/tpm.tsv", header = T)
rownames(tpm) = tpm$target_id
tpm = tpm[, -1]
# transform data
transposed = t(tpm)

pca_exp = PCA(transposed, ncp = 20, graph = F)
pcadf = data.frame(tissue = paste0(kall_info$sex, ", ", kall_info$organ), 
                   pca1 = as.data.frame(pca_exp$ind)[,1],
                   pca2 = as.data.frame(pca_exp$ind)[,2])

ggplot(pcadf) +
  geom_hline(yintercept = 0, color="gray40")+
  geom_vline(xintercept = 0, color="gray40")+
  geom_jitter(aes(pca1, pca2, fill=tissue), shape=21, size=8) +
  labs(x="Component 1 (35.67%)", y= "Component 2 (13.39%)", fill= "Tissue") +
  scale_fill_manual(values = c("#3ab98f", "#2ab1c6", "#ee4d80", "#b52ac6")) +
  guides(fill = guide_legend(override.aes = list(size=8))) +
  coord_fixed(ratio = 1.5) +
  theme_bw(base_size = 20) +
  theme(legend.position = c(0.25, 0.8),
        legend.text = element_text(size = 10),
        legend.title = element_text(size = 10),
        legend.background = element_rect(fill = "gray"),
        aspect.ratio=1)

ggsave("results/pca_sbge.png", width = 8, height = 6, dpi = 600)

rm(pcadf, pca_exp)

Differential Expression for sex-biased genes in Flowers

# Select flower bud samples
fsbge = kall %>% select(grep('B', colnames(kall)))
fsbge_info = kall_info %>% filter(organ=="Flower bud")
# define groups (in the same order as the table)
fgroups = kall_info[kall_info$organ == "Flower bud", ]$sex

flowers_genes = pyramide(fsbge, fsbge_info, ~sex, fgroups)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## Disp = 0.06511 , BCV = 0.2552
## Removing intercept from test coefficients
rm(fsbge, fsbge_info, fgroups)

Differential Expression for sex-biased genes in Leaves

# Select leaf samples
lsbge = kall %>% select(grep('L', colnames(kall)))
lsbge_info = kall_info %>% filter(organ=="Leaf")
# define groups (in the same order as the table)
lgroups = kall_info[kall_info$organ == "Leaf", ]$sex

leaves_genes = pyramide(lsbge, lsbge_info, ~sex, lgroups)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## Disp = 0.16956 , BCV = 0.4118
## Removing intercept from test coefficients
rm(lsbge, lsbge_info, lgroups)

Differential Expression for tissue-biased genes in Females

# Select female samples
fet_sbge = kall %>% select(rownames(kall_info[kall_info$sex=="Female", ]))
fet_sbge_info = kall_info %>% filter(sex=="Female")
# define groups (in the same order as the table)
fegroups = kall_info[kall_info$sex == "Female", ]$organ

female_genes = pyramide(fet_sbge, fet_sbge_info, ~organ, fegroups)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
## Disp = 0.13265 , BCV = 0.3642
## Removing intercept from test coefficients
rm(fet_sbge, fet_sbge_info, fegroups)

Differential Expression for tissue-biased genes in Males

# Select male samples
mt_sbge = kall %>% select(rownames(kall_info[kall_info$sex=="Male", ]))
mt_sbge_info = kall_info %>% filter(sex=="Male")
# define groups (in the same order as the table)
mgroups = kall_info[kall_info$sex == "Male", ]$organ

male_genes = pyramide(mt_sbge, mt_sbge_info, ~organ, mgroups)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
## Disp = 0.1018 , BCV = 0.3191
## Removing intercept from test coefficients
rm(mt_sbge, mt_sbge_info, mgroups)

Intersections and plot

sbge_list = list("Sex-biased in flower buds" = flowers_genes[["clusters"]], 
                  "Sex-biased in leaves" = leaves_genes[["clusters"]], 
                  "Tissue-biased in females" = female_genes[["clusters"]], 
                  "Tissue-biased in males" = male_genes[["clusters"]])

upset(fromList(sbge_list),
      nintersects = 8,
      mainbar.y.label = "Shared DE genes",
      sets.x.label = "DE genes", 
      matrix.color = met.brewer("Egypt")[2],
      main.bar.color = met.brewer("Egypt")[3],
      sets.bar.color = met.brewer("Egypt")[4],
      shade.color = met.brewer("Egypt")[2],
      point.size = 10,
      line.size = 4,
      text.scale = c(3, 4, 3, 4, 5, 4),
      mb.ratio = c(0.6, 0.4))

rm(sbge_list)
# Sex-biased in flowers
baits_flowers = flowers_genes[["clusters"]]
baits_flowers = as.data.frame(tpm[baits_flowers, ]) %>% select(grep('B', colnames(tpm))) 
baits_flowers = baits_flowers %>% mutate(gene = rownames(baits_flowers)) %>% group_by(gene) %>% summarise(female_exp = mean(c(C1_26_B, C1_27_B, C1_29_B_combined, C1_34_B_combined)),
                                                                                                          male_exp = mean(c(C1_01_B, C1_03_B, C1_04_B_combined, C1_05_B_combined)),
                                                                                                          male_biased = male_exp>female_exp,
                                                                                                          type = "Sex-biased genes in flowers") %>% add_count(type)

# Sex-biased in leaves
baits_leaves = leaves_genes[["clusters"]]
baits_leaves = as.data.frame(tpm[baits_leaves, ]) %>% select(grep('L', colnames(tpm))) 
baits_leaves = baits_leaves %>% mutate(gene = rownames(baits_leaves)) %>% group_by(gene) %>% summarise(female_exp = mean(c(C1_26_L, C1_27_L, C1_29_L, C1_34_L)),
                                                                                                       male_exp = mean(c(C1_01_L, C1_03_L, C1_04_L, C1_05_L)),
                                                                                                       male_biased = male_exp>female_exp,
                                                                                                       type = "Sex-biased genes in leaves") %>% add_count(type)


# Merge
bait_df = rbind(baits_flowers, baits_leaves)

# Plot all
ggplot(bait_df) +
  annotate(geom = "polygon", x = c(-Inf, Inf, Inf), y = c(-Inf, Inf, -Inf), fill = "gray8", alpha = 0.2 ) +
  geom_abline(intercept = 0, slope = 1, alpha=0.6, color="red2", size=1, linetype="dashed") +
  geom_jitter(aes(female_exp, male_exp, fill=male_biased), shape=21, size=4) +
  geom_text(aes(x = -Inf, y = -Inf, label= paste0("n = ", n)), hjust = -3, vjust = -28) +
  scale_x_continuous(limits = c(-2.5, 10)) +
  scale_y_continuous(limits = c(-2.5, 10)) +
  labs(x= "♀ TPM", y="♂ TPM") +
  scale_fill_manual(values = met.brewer("Derain")) +
  scale_y_continuous(limits = c(0,10)) +
  scale_x_continuous(limits = c(0,10)) +
  theme_bw() +
  theme(legend.position = "none",
        text = element_text(size = 20),
        panel.border = element_rect(color = "black", fill=NA),
        aspect.ratio=1) +
  facet_grid(~type)
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Warning: Removed 1332 rows containing missing values (`geom_point()`).

ggsave("results/sbge.png", width = 8, height = 8, dpi = 600)
## Warning: Removed 1329 rows containing missing values (`geom_point()`).
rm(baits_leaves, baits_flowers)

4.- Post-transcriptional Gene Silencing

# formatting of the data
sRNA_clusters = as.data.table(read.table("raw/only_21-22_known_de_novo_f-y/Results.txt", header = T)) # transform to data table
sRNA_clusters[, min := min(Start, End), by=1:nrow(sRNA_clusters)] # unstrand start 
sRNA_clusters[, max := max(Start, End), by=1:nrow(sRNA_clusters)] # unstrand end

# get gene annotations, already including +-400 bp
genesCoordinates = fread("annotation/full_genes.bed")
colnames(genesCoordinates) = c("chr", "start", "end", "gene_name", "score", "strand")
genesCoordinates[, min := min(start, end), by=1:nrow(genesCoordinates)]
genesCoordinates[, max := max(start, end), by=1:nrow(genesCoordinates)]
genesCoordinates = genesCoordinates[!duplicated(genesCoordinates$gene_name),] # remove duplicates

# select for sex-DE of genes/sRNAs in Flowers
sRNA_clusters_bait = sRNA_clusters[Name %in% flowers[["clusters"]]] # Select DE sNRAs
genesCoordinates_bait = genesCoordinates[gene_name %in% flowers_genes[["clusters"]]] # select DE genes

# Initiate data.table containing the header
header = list('gene_name', 'overlapping_cluster', 'comparison')
fwrite(header, 'results/overlaps/overlapping_clusters_ptgs.txt', append = FALSE, sep='\t')

# For each gene, get all overlapping siRNA clusters
for (i in seq(1, nrow(genesCoordinates_bait))) {
  # Current gene info
  CurrentGene <- genesCoordinates_bait$gene_name[i]
  CurrentChr <- genesCoordinates_bait$chr[i]
  Currentmin <- genesCoordinates_bait$min[i]
  Currentmax <- genesCoordinates_bait$max[i]
  # make a subset of the siRNA cluster data table for clusters overlapping the gene
  # overlap happens if cluster minimum is lower than gene maximum AND cluster maximum is higher than gene minimum
  siRNAsubset <- sRNA_clusters_bait[(Chrom == CurrentChr)&(min < Currentmax)&(max > Currentmin)]
  # write a line with output and append it to output data.table
#  line <- list(CurrentGene, nrow(siRNAsubset))
  line = as.data.frame(siRNAsubset$Name) %>% mutate(CurrentGene, "sex_biased_in_flowers") %>% relocate(CurrentGene)
  fwrite(line, 'results/overlaps/overlapping_clusters_ptgs.txt', append = TRUE, sep='\t')
}
###########################################################################################################################
# select for sex-DE of genes/sRNAs in leaves
sRNA_clusters_bait = sRNA_clusters[Name %in% leaves[["clusters"]]] # Select DE sNRAs
genesCoordinates_bait = genesCoordinates[gene_name %in% leaves_genes[["clusters"]]] # select DE genes

# For each gene, get all overlapping siRNA clusters
for (i in seq(1, nrow(genesCoordinates_bait))) {
  # Current gene info
  CurrentGene <- genesCoordinates_bait$gene_name[i]
  CurrentChr <- genesCoordinates_bait$chr[i]
  Currentmin <- genesCoordinates_bait$min[i]
  Currentmax <- genesCoordinates_bait$max[i]
  # make a subset of the siRNA cluster data table for clusters overlapping the gene
  # overlap happens if cluster minimum is lower than gene maximum AND cluster maximum is higher than gene minimum
  siRNAsubset <- sRNA_clusters_bait[(Chrom == CurrentChr)&(min < Currentmax)&(max > Currentmin)]
  # write a line with output and append it to output data.table
#  line <- list(CurrentGene, nrow(siRNAsubset))
  line = as.data.frame(siRNAsubset$Name) %>% mutate(CurrentGene, "sex_biased_in_leaves") %>% relocate(CurrentGene)
  fwrite(line, 'results/overlaps/overlapping_clusters_ptgs.txt', append = TRUE, sep='\t')
}
# transform and add info to sRNA quantification
transposed = transpose(matrix_ptgs)
colnames(transposed) = rownames(matrix_ptgs)
rownames(transposed) = colnames(matrix_ptgs)
transposed = cpm(transposed, log = T) # transform to logRPM
# add samples as column
transposed = as.data.frame(transposed) %>% mutate(sample = rownames(transposed)) %>% relocate(sample) 
datainfo_s = datainfo %>% mutate(sample = rownames(datainfo)) %>% relocate(sample) 
# combine 
fermented_srna = right_join(datainfo_s, transposed, by = "sample")

# transform and add info to gene quant
fermented_genes = transpose(tpm)
colnames(fermented_genes) = rownames(tpm)
rownames(fermented_genes) = colnames(tpm)
fermented_genes = as.data.frame(fermented_genes) %>% mutate(sample = rownames(fermented_genes)) %>% relocate(sample) 
datainfo_s = kall_info %>% mutate(sample = rownames(kall_info)) %>% relocate(sample) 
fermented_genes = right_join(datainfo_s, fermented_genes, by = "sample")


# Get overlapping clusters
over_flowers = read_table("results/overlaps/flowers.tsv", col_names = F)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_character(),
##   X2 = col_character()
## )
colnames(over_flowers) = c("gene_name", "cluster")
over_flowers = filter(over_flowers, gene_name=="Silat_chr12_Gene.1440")

# leaves
over_flowers = read_table("results/overlaps/leaves.tsv", col_names = F)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_character(),
##   X2 = col_character()
## )
colnames(over_flowers) = c("gene_name", "cluster")
over_flowers = filter(over_flowers, gene_name=="Silat_scaffold_13_Gene.43660", cluster=="Cluster_45289") # TE with precursor
# over_flowers = filter(over_flowers, gene_name=="Silat_chr12_Gene.3592") # with overlap


# experiment 1, plot 
fermented_gene_bait = subset(fermented_genes, select = c("sample", "sex", "organ", levels(as.factor(over_flowers$gene_name)))) %>% mutate(element = "gene")
fermented_gene_bait = melt(fermented_gene_bait, id.vars = c("sample", "sex", "organ", "element"), variable.name = "name", value.name = "expression")
fermented_srna_bait = subset(fermented_srna, select = c("sample", "sex", "organ", levels(as.factor(over_flowers$cluster)))) %>% mutate(element = "srna")
fermented_srna_bait = melt(fermented_srna_bait, id.vars = c("sample", "sex", "organ", "element"), variable.name = "name", value.name = "expression")

dough = rbind(fermented_gene_bait, fermented_srna_bait)

dough %>% group_by(name, sex, organ) %>% summarise(mean = mean(expression), sex, name, element, organ) %>% unique() %>%
ggplot(aes(x=sex)) +
  facet_wrap(~organ, nrow = 1) +
  geom_point(aes(y= mean, color = name), size=4) +
  geom_line(aes(y= mean, group = name, color=name), size=2) +
  scale_color_manual(values = met.brewer("Egypt")) +
  labs(x=NULL, y="Mean Expression", color=NULL) +
  theme(legend.position = "top",
        text = element_text(size = 20),
        panel.border = element_rect(color = "black", fill=NA),
        aspect.ratio=1) 
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'name', 'sex', 'organ'. You can override
## using the `.groups` argument.

#ggsave("results/bait_exploration.png", width = 8, height = 8, dpi = 600)
#ggsave("results/flower_candidate_precursor.png", width = 8, height = 8, dpi = 600)
ggsave("results/leaves_candidate_precursor.png", width = 8, height = 8, dpi = 600)
# get chr boundaries again
chr_sizes = read.table("chromSizes.txt")
colnames(chr_sizes) = c("chr", "end")
chr_sizes = chr_sizes %>% mutate(start = "1") %>% relocate(start, .before = end)

# load gene bed
gene_bed = read.table("annotation/genes.bed")
colnames(gene_bed) = c("chr", "start", "end", "gene", "score", "strand")

# Distribution of inverted repeats and TE annotation
#inv_bed = read.table("annotation/inv_rep.bed")
#tes_bed =  read.table("annotation/tes.bed", sep = "\t")

# load candidate info in flowers
cand_flowers = read.table("results/overlaps/flowers_precursos.tsv")
colnames(cand_flowers) = c("inv_rep", "chr", "start", "end", "sRNA", "gene")

# merge info
cand_flowers = left_join(cand_flowers, gene_bed, by="gene")

# plot 
{
png(file="results/candidate_flowers.png", width=1000, height=1000)
circos.par("clock.wise"=TRUE, start.degree=86, gap.degree=c(rep(1,18), 8))
circos.genomicInitialize(chr_sizes, plotType = c("axis"), axis.labels.cex=1)
circos.track(ylim = c(0, 0.05), bg.border = NA, bg.col = c(viridis::inferno(11, begin=0.15, end=0.9, direction = -1), "gray", rep("black", 8)) , track.height = 0.05)
#circos.genomicDensity(gene_bed, col = c("#ee4d80"), track.height = 0.1, count_by = "number")
#circos.genomicDensity(tes_bed, col = c("#b52ac6"), track.height = 0.1, count_by = "number")
#circos.genomicDensity(inv_bed, col = "blue", track.height = 0.1, count_by = "number")
circos.genomicLink(region1 = cand_flowers[, c("chr.x", "start.x", "end.x")], region2 = cand_flowers[, c("chr.y", "start.y", "end.y")], col = "#5EBF75", lwd=6, direction=1, arr.width=0.5)
circos.clear()
dev.off()
}
## png 
##   2
# load candidate info in leaves
cand_leaves = read.table("results/overlaps/leaves_precursos.tsv")
colnames(cand_leaves) = c("inv_rep", "chr", "start", "end", "sRNA", "gene")

# merge gene info
cand_leaves = left_join(cand_leaves, gene_bed, by="gene")

# plot 
{
png(file="results/candidate_leaves.png", width=2000, height=2000)
circos.par("clock.wise"=TRUE, start.degree=86, gap.degree=c(rep(1,18), 8))
circos.genomicInitialize(chr_sizes, plotType = c("axis"), axis.labels.cex=2)
circos.track(ylim = c(0, 0.05), bg.border = NA, bg.col = c(viridis::inferno(11, begin=0.15, end=0.9, direction = -1), "gray", rep("black", 8)) , track.height = 0.05)
#circos.genomicDensity(gene_bed, col = c("#ee4d80"), track.height = 0.1, count_by = "number")
#circos.genomicDensity(tes_bed, col = c("#b52ac6"), track.height = 0.1, count_by = "number")
#circos.genomicDensity(inv_bed, col = "blue", track.height = 0.1, count_by = "number")
circos.genomicLink(region1 = cand_leaves[, c("chr.x", "start.x", "end.x")], region2 = cand_leaves[, c("chr.y", "start.y", "end.y")], col = "#5EBF75", lwd=6, direction=1, arr.width=1)
circos.clear()
dev.off()
}
## png 
##   2
rm(inv_bed, tes_bed, cand_flowers, cand_leaves, gene_bed)
## Warning in rm(inv_bed, tes_bed, cand_flowers, cand_leaves, gene_bed): object
## 'inv_bed' not found
## Warning in rm(inv_bed, tes_bed, cand_flowers, cand_leaves, gene_bed): object
## 'tes_bed' not found
# sex-biased PTGS interaction
chi_flower = matrix(c(c(10, 758),
                      c(32, 1764)),
                    nrow = 2, byrow = T)

colnames(chi_flower) = c("number of overlaps to sex-biased clusters", "number of overlaps to unbiased clusters")
rownames(chi_flower) = c("sex-biased genes", "unbiased genes")

chi_leaves = matrix(c(c(3, 38),
                      c(27, 2220)),
                    nrow = 2, byrow = T)
colnames(chi_leaves) = c("number of overlaps to sex-biased clusters", "number of overlaps to unbiased clusters")
rownames(chi_leaves) = c("sex-biased genes", "unbiased genes")

# Perform chi-square test
chisq.test(chi_flower) #the observed values don't significantly differ from the expected values
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  chi_flower
## X-squared = 0.4993, df = 1, p-value = 0.4798
chisq.test(chi_leaves) #the observed values significantly differ from the expected values
## Warning in chisq.test(chi_leaves): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  chi_leaves
## X-squared = 7.3912, df = 1, p-value = 0.006554
rm(chi_flower, chi_leaves)
# We calculate depths for each sex-biased gene in Flowers

surco_gene = function(toy, lib_size_1, lib_size_2, lib_size_3) {
          # set column names
          colnames(toy) = c("chr", "start", "end", "depth_1", "depth_2", "depth_3", "gene")
          # calculate mead depth per window, normalize by library size
          toy[, mean_depth_1 := mean(depth_1)/lib_size_1, by=.(gene)]
          toy[, mean_depth_2 := mean(depth_2)/lib_size_2, by=.(gene)]
          toy[, mean_depth_3 := mean(depth_3)/lib_size_3, by=.(gene)]
          # get one row per window
          toy = unique(toy, by = "gene")[, .(gene, mean_depth_1, mean_depth_2, mean_depth_3)]
          # overall mean and multiply by 1m
          toy[, mean_depth := (mean_depth_1 + mean_depth_2 + mean_depth_3)*(1000000/3), by=1:nrow(toy)]
          toy[, log_depth := log2(mean_depth), by=1:nrow(toy)]
          # print bed file + mean depth per million
          toy[, .(gene, log_depth, mean_depth)]
}

# Females
flo_sbg_fe_d = fread("results/gene-level/flowers_depth_females.tsv")
flo_sbg_fe_d = surco_gene(flo_sbg_fe_d, 54935002, 29689313, 25245689)
colnames(flo_sbg_fe_d) = c("gene", "log_female_exp", "female_exp")

# Males
flo_sbg_ma_d = fread("results/gene-level/flowers_depth_males.tsv")
flo_sbg_ma_d = surco_gene(flo_sbg_ma_d, 13224600, 23943681, 23582102)
colnames(flo_sbg_ma_d) = c("gene", "log_male_exp", "male_exp")

flo_depth = full_join(flo_sbg_fe_d, flo_sbg_ma_d, by = "gene")

flo_depth$log_female_exp[is.na(flo_depth$log_female_exp)] = -5 # replace NA for "0" mapping
flo_depth$log_male_exp[is.na(flo_depth$log_male_exp)] = -5

# Plot

ggplot(flo_depth) +
  annotate(geom = "polygon", x = c(-Inf, Inf, Inf), y = c(-Inf, Inf, -Inf), fill = "gray8", alpha = 0.2 ) +
  geom_abline(intercept = 0, slope = 1, alpha=0.6, color="red2", size=1, linetype="dashed") +
  geom_jitter(aes(log_female_exp, log_male_exp), color=met.brewer("Derain", 1), size=4) +
  scale_x_continuous(limits = c(-5, 4)) +
  scale_y_continuous(limits = c(-5, 4)) +
  labs(x= "♀ log2RPM sRNA", y="♂ log2RPM sRNA") +
  scale_fill_manual(values = met.brewer("Derain")) +
  theme_bw() +
  theme(legend.position = "none",
        text = element_text(size = 20),
        panel.border = element_rect(color = "black", fill=NA),
        aspect.ratio=1) 
## Warning: Removed 909 rows containing missing values (`geom_point()`).

ggsave("results/gene-level_mapping.png", width = 8, height = 6, dpi = 600)
## Warning: Removed 913 rows containing missing values (`geom_point()`).
rm(flo_depth, flo_sbg_fe_d, flo_sbg_ma_d)

5.- RNA-directed Methylation

Data prep and exploration

exp_rddm = read.table("raw/only_24_f-y/Counts.txt", header = T)

# sRNA clusters names will be kept as row names, remove columns with ids
matrix_rddm = exp_rddm[, 4:15]
row.names(matrix_rddm) = paste(exp_rddm$Name, "_RdDM", sep = "") # so we can differentiate from PTGS

# discard low counts (<0.5 RPM) entries for at least 2 samples
keep = rowSums(cpm(matrix_rddm)>0.5) >= 2   
summary(keep) # 132 will be removed 
##    Mode   FALSE    TRUE 
## logical     132   45256
matrix_rddm = matrix_rddm[keep,]
rm(keep)

# we take the data info from PTGS, assuming it is in the environment
### All together

transposed = t(cpm(exp_rddm[, 4:15], log = T))

colnames(transposed) = exp_rddm$Name

pca_exp = PCA(transposed, ncp = 20, graph = F)

pcadf = data.frame(samples, 
                   pca1 = as.data.frame(pca_exp$ind)[,1],
                   pca2 = as.data.frame(pca_exp$ind)[,2])

ggplot(pcadf) +
  geom_hline(yintercept = 0, color="gray40")+
  geom_vline(xintercept = 0, color="gray40")+
  geom_jitter(aes(pca1, pca2, fill=tissue), shape=21, size=8) +
  labs(x="Component 1 (35.67%)", y= "Component 2 (13.39%)", fill= "Tissue") +
  scale_fill_manual(values = c("#3ab98f", "#2ab1c6", "#ee4d80", "#b52ac6")) +
  guides(fill = guide_legend(override.aes = list(size=8))) +
  coord_fixed(ratio = 1.5) +
  theme_bw(base_size = 20) +
  theme(legend.position = c(0.8, 0.7),
        legend.text = element_text(size = 10),
        legend.title = element_text(size = 10),
        legend.background = element_rect(fill = "gray"),
        aspect.ratio=1)

ggsave("results/pca_rddm.png", width = 8, height = 6, dpi = 600)

rm(pcadf, pca_exp)

Differential Expression for sex-biased 24nt sRNAs in flowers

# Select samples
frddm = matrix_rddm %>% select(grep('B', colnames(matrix_rddm)))
frddm_info = datainfo %>% filter(organ=="Flower bud")

# define groups (in the same order as the table)
fgroups = datainfo[datainfo$organ == "Flower bud", ]$sex

flowers_rddm = pyramide(frddm, frddm_info, ~sex, fgroups)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## Disp = 0.72015 , BCV = 0.8486
## Removing intercept from test coefficients
rm(frddm, frddm_info, fgroups)

Differential Expression for sex-biased 24nt sRNAs in leaves

# Select samples
lrddm = matrix_rddm %>% select(grep('L', colnames(matrix_rddm)))
lrddm_info = datainfo %>% filter(organ=="Leaf")

# define groups (in the same order as the table)
lgroups = datainfo[datainfo$organ == "Leaf", ]$sex

leaves_rddm = pyramide(lrddm, lrddm_info, ~sex, lgroups)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## Disp = 0.79975 , BCV = 0.8943
## Removing intercept from test coefficients
rm(lrddm, lrddm_info, lgroups)

Differential Expression for tissue-biased 24nt sRNAs in females

# Select samples
ferddm = matrix_rddm %>% select(grep('F', colnames(matrix_rddm)))
ferddm_info = datainfo %>% filter(sex=="Female")

# define groups (in the same order as the table)
fegroups = datainfo[datainfo$sex == "Female", ]$organ

females_rddm = pyramide(ferddm, ferddm_info, ~organ, fegroups)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
## Disp = 0.6601 , BCV = 0.8125
## Removing intercept from test coefficients
rm(ferddm, ferddm_info, fegroups)

Differential Expression for tissue-biased 24nt sRNAs in males

# Select samples
marddm = matrix_rddm %>% select(grep('M', colnames(matrix_rddm)))
marddm_info = datainfo %>% filter(sex=="Male")

# define groups (in the same order as the table)
magroups = datainfo[datainfo$sex == "Male", ]$organ

males_rddm = pyramide(marddm, marddm_info, ~organ, magroups)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating size factors
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
## Disp = 0.85816 , BCV = 0.9264
## Removing intercept from test coefficients
rm(marddm, marddm_info, magroups)

Further analysis

rddm_list = list("Sex-biased in flower buds" = flowers_rddm[["clusters"]], 
                  "Sex-biased in leaves" = leaves_rddm[["clusters"]], 
                  "Tissue-biased in females" = females_rddm[["clusters"]], 
                  "Tissue-biased in males" = males_rddm[["clusters"]])

upset(fromList(rddm_list),
      nintersects = 8,
      mainbar.y.label = "Shared DE sRNA clusters",
      sets.x.label = "DE sRNA clusters", 
      matrix.color = met.brewer("Egypt")[2],
      main.bar.color = met.brewer("Egypt")[3],
      sets.bar.color = met.brewer("Egypt")[4],
      shade.color = met.brewer("Egypt")[2],
      point.size = 10,
      line.size = 4,
      text.scale = c(3, 4, 3, 4, 5, 4)
      )

rm(rddm_list)
# Sex-biased in flowers
baits_flowers = flowers_rddm[["clusters"]]
baits_flowers = as.data.frame(cpm(matrix_rddm, log=TRUE)[baits_flowers, ] ) %>% select(grep('B', colnames(matrix_rddm))) 
baits_flowers = baits_flowers %>% mutate(cluster = rownames(baits_flowers)) %>% group_by(cluster) %>% summarise(female_exp = mean(c(F1B_dicer, F2B_dicer, F3B_dicer)), 
                                                                                                      male_exp = mean(c(M1B_dicer, M2B_dicer, M4B_dicer)), 
                                                                                                      male_biased = male_exp>female_exp,
                                                                                                      type = "Sex-biased sRNAs in flowers") %>% add_count(type)
# Sex-biased in leaves
baits_leaves = leaves_rddm[["clusters"]]
baits_leaves = as.data.frame(cpm(matrix_rddm, log=TRUE)[baits_leaves, ] ) %>% select(grep('L', colnames(matrix_rddm))) 
baits_leaves = baits_leaves %>% mutate(cluster = rownames(baits_leaves)) %>% group_by(cluster) %>% summarise(female_exp = mean(c(F1L_dicer, F2L_dicer, F3L_dicer)), 
                                                                                                   male_exp = mean(c(M1L_dicer, M2L_dicer, M4L_dicer)), 
                                                                                                   male_biased = male_exp>female_exp,
                                                                                                   type = "Sex-biased sRNAs in leaves") %>% add_count(type)


bait_df = rbind(baits_flowers, baits_leaves)

# Plot all
ggplot(bait_df) +
  annotate(geom = "polygon", x = c(-Inf, Inf, Inf), y = c(-Inf, Inf, -Inf), fill = "gray8", alpha = 0.2 ) +
  geom_abline(intercept = 0, slope = 1, alpha=0.6, color="red2", size=1, linetype="dashed") +
  geom_jitter(aes(female_exp, male_exp, fill=male_biased), size=4, shape=21, color="black") +
  geom_text(aes(x = -Inf, y = -Inf, label= paste0("n = ", n)), hjust = -.5, vjust   = -28) +
  scale_x_continuous(limits = c(-2.5, 10)) +
  scale_y_continuous(limits = c(-2.5, 10)) +
  labs(x= "♀ logRPM", y="♂ logRPM") +
  scale_fill_manual(values = met.brewer("Derain")) +
  theme_bw() +
  theme(legend.position = "none",
        text = element_text(size = 20),
        panel.border = element_rect(color = "black", fill=NA),
        aspect.ratio=1) +
  facet_grid(~type)
## Warning: Removed 1912 rows containing missing values (`geom_point()`).

ggsave("results/desrnas_rddm.png", width = 8, height = 8, dpi = 600)
## Warning: Removed 1912 rows containing missing values (`geom_point()`).
rm(baits_flowers, baits_leaves)
# transform and add info to sRNA quantification
transposed = transpose(matrix_rddm)
colnames(transposed) = rownames(matrix_rddm)
rownames(transposed) = colnames(matrix_rddm)
transposed = cpm(transposed, log = T) # transform to logRPM
# add samples as column
transposed = as.data.frame(transposed) %>% mutate(sample = rownames(transposed)) %>% relocate(sample) 
datainfo_s = datainfo %>% mutate(sample = rownames(datainfo)) %>% relocate(sample) 
# combine 
fermented_rddm = right_join(datainfo_s, transposed, by = "sample")

# transform and add info to gene quant
fermented_genes = transpose(kall)
colnames(fermented_genes) = rownames(kall)
rownames(fermented_genes) = colnames(kall)
fermented_genes = cpm(fermented_genes, log = T) # transform to logTPM
fermented_genes = as.data.frame(fermented_genes) %>% mutate(sample = rownames(fermented_genes)) %>% relocate(sample) 
datainfo_s = kall_info %>% mutate(sample = rownames(kall_info)) %>% relocate(sample) 
fermented_genes = right_join(datainfo_s, fermented_genes, by = "sample")

# Get overlapping clusters
over_flowers = read_table("results/rddm/flowers.tsv", col_names = F)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_character(),
##   X2 = col_character()
## )
colnames(over_flowers) = c("cluster", "gene_name")
over_flowers = over_flowers %>% relocate(gene_name)
# add an ID to each intersection (overlap gene and sRNA)
over_flowers = over_flowers %>% mutate(overlap = paste(gene_name, cluster, sep = " // "))

# get expression data of the genes 
fermented_gene_bait = subset(fermented_genes, select = c("sample", "sex", "organ", levels(as.factor(over_flowers$gene_name)))) %>% mutate(element = "gene")
fermented_gene_bait = filter(fermented_gene_bait, organ=="Flower bud")
fermented_gene_bait = melt(fermented_gene_bait, id.vars = c("sample", "sex", "organ", "element"), variable.name = "name", value.name = "expression")
# get overlap ID
fermented_gene_bait = left_join(fermented_gene_bait, over_flowers[,c("gene_name", "overlap")], by=join_by("name"=="gene_name")) #somehow it gets filled in missing samples?
## Warning in left_join(fermented_gene_bait, over_flowers[, c("gene_name", : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 73 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# mow for the sRNA
fermented_rddm_bait = subset(fermented_rddm, select = c("sample", "sex", "organ", levels(as.factor(over_flowers$cluster)))) %>% mutate(element = "srna")
fermented_rddm_bait = filter(fermented_rddm_bait, organ=="Flower bud")
fermented_rddm_bait = melt(fermented_rddm_bait, id.vars = c("sample", "sex", "organ", "element"), variable.name = "name", value.name = "expression")
# get overlap ID
fermented_rddm_bait = left_join(fermented_rddm_bait, over_flowers[,c("cluster", "overlap")], by=join_by("name"=="cluster"))

# merge data
dough = rbind(fermented_gene_bait, fermented_rddm_bait)

# plot
dough %>% group_by(name, sex, organ) %>% summarise(mean = mean(expression), sex, name, element, organ, overlap) %>% unique() %>%
ggplot(aes(x=sex)) +
  facet_wrap(~overlap, nrow = 5, ncol=5) +
  geom_point(aes(y= mean, color = element), size=4) +
  geom_line(aes(y= mean, group = name, color=element), size=2) +
  scale_color_manual(values = met.brewer("Egypt")) +
  labs(x=NULL, y="Mean Expression", color=NULL) +
  theme(legend.position = "top",
        text = element_text(size = 20),
        panel.border = element_rect(color = "black", fill=NA),
        aspect.ratio=1) 
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'name', 'sex', 'organ'. You can override
## using the `.groups` argument.

ggsave("results/rddm_flowers_candidates.png", width = 36, height = 36, dpi = 600)

# plot only the best candidates (manual selection)

best_flowers = filter(over_flowers, gene_name %in% c("Silat_chr1_Gene.4771",
                                                     "Silat_chr12_Gene.14608",
                                                     "Silat_chr12_Gene.26394",
                                                     "Silat_chr2_Gene.30622",
                                                     "Silat_chr3_Gene.37746",
                                                     "Silat_chr9_Gene.46125"))$overlap

dough %>% filter(overlap %in% best_flowers) %>% group_by(name, sex, organ) %>% summarise(mean = mean(expression), sex, name, element, organ, overlap) %>% unique() %>% left_join(over_flowers[,c("gene_name", "overlap")], by="overlap") %>%
ggplot(aes(x=sex)) +
  facet_wrap(~gene_name, nrow = 2, ncol=3) +
  geom_point(aes(y= mean, color = element), size=4) +
  geom_line(aes(y= mean, group = name, color=element), size=2) +
  scale_color_manual(values = met.brewer("Egypt")) +
  labs(x=NULL, y="Mean Expression", color=NULL) +
  theme(legend.position = "top",
        text = element_text(size = 20),
        panel.border = element_rect(color = "black", fill=NA),
        aspect.ratio=1) 
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'name', 'sex', 'organ'. You can override
## using the `.groups` argument.

ggsave("results/rddm_flowers_candidates_best.png", width = 14, height = 8, dpi = 600)

###########################################################################

# now for leaves
over_leaves = read_table("results/rddm/leaves.tsv", col_names = F)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   X1 = col_character(),
##   X2 = col_character()
## )
colnames(over_leaves) = c("cluster", "gene_name")
over_leaves = over_leaves %>% relocate(gene_name)
# add an ID to each intersection (overlap gene and sRNA)
over_leaves = over_leaves %>% mutate(overlap = paste(gene_name, cluster, sep = " // "))

# get expression data of the genes 
fermented_gene_bait = subset(fermented_genes, select = c("sample", "sex", "organ", levels(as.factor(over_leaves$gene_name)))) %>% mutate(element = "gene")
fermented_gene_bait = filter(fermented_gene_bait, organ=="Leaf")
fermented_gene_bait = melt(fermented_gene_bait, id.vars = c("sample", "sex", "organ", "element"), variable.name = "name", value.name = "expression")
# get overlap ID
fermented_gene_bait = left_join(fermented_gene_bait, over_leaves[,c("gene_name", "overlap")], by=join_by("name"=="gene_name")) #somehow it gets filled in missing samples?
## Warning in left_join(fermented_gene_bait, over_leaves[, c("gene_name", "overlap")], : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 89 of `x` matches multiple rows in `y`.
## ℹ Row 4 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# mow for the sRNA
fermented_rddm_bait = subset(fermented_rddm, select = c("sample", "sex", "organ", levels(as.factor(over_leaves$cluster)))) %>% mutate(element = "srna")
fermented_rddm_bait = filter(fermented_rddm_bait, organ=="Leaf")
fermented_rddm_bait = melt(fermented_rddm_bait, id.vars = c("sample", "sex", "organ", "element"), variable.name = "name", value.name = "expression")
# get overlap ID
fermented_rddm_bait = left_join(fermented_rddm_bait, over_leaves[,c("cluster", "overlap")], by=join_by("name"=="cluster"))

# merge data
dough = rbind(fermented_gene_bait, fermented_rddm_bait)

# plot
dough %>% group_by(name, sex, organ) %>% summarise(mean = mean(expression), sex, name, element, organ, overlap) %>% unique() %>%
ggplot(aes(x=sex)) +
  facet_wrap(~overlap, nrow = 5, ncol=3) +
  geom_point(aes(y= mean, color = element), size=4) +
  geom_line(aes(y= mean, group = name, color=element), size=2) +
  scale_color_manual(values = met.brewer("Egypt")) +
  labs(x=NULL, y="Mean Expression", color=NULL) +
  theme(legend.position = "top",
        text = element_text(size = 20),
        panel.border = element_rect(color = "black", fill=NA),
        aspect.ratio=1) 
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'name', 'sex', 'organ'. You can override
## using the `.groups` argument.

ggsave("results/rddm_leaves_candidates.png", width = 20, height = 36, dpi = 600)

# plot the best ones

best_leaves = filter(over_leaves, gene_name %in% c("Silat_chr10_Gene.49028",
                                                   "Silat_chr12_Gene.2070",
                                                   "Silat_chr9_Gene.46125"))$overlap

dough %>% filter(overlap %in% best_leaves) %>% group_by(name, sex, organ) %>% summarise(mean = mean(expression), sex, name, element, organ, overlap) %>% unique() %>% left_join(over_leaves[,c("gene_name", "overlap")], by="overlap") %>%
ggplot(aes(x=sex)) +
  facet_wrap(~gene_name, nrow = 2, ncol=3) +
  geom_point(aes(y= mean, color = element), size=4) +
  geom_line(aes(y= mean, group = name, color=element), size=2) +
  scale_color_manual(values = met.brewer("Egypt")) +
  labs(x=NULL, y="Mean Expression", color=NULL) +
  theme(legend.position = "top",
        text = element_text(size = 20),
        panel.border = element_rect(color = "black", fill=NA),
        aspect.ratio=1) 
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'name', 'sex', 'organ'. You can override
## using the `.groups` argument.

ggsave("results/rddm_leaves_candidates_best.png", width = 14, height = 4, dpi = 600)

######################
rm(transposed, fermented_gene_bait, fermented_genes, fermented_rddm, fermented_rddm_bait, dough)
# select manually curated genes
best_flowers = filter(over_flowers, gene_name %in% c("Silat_chr1_Gene.4771",
                                                     "Silat_chr12_Gene.14608",
                                                     "Silat_chr12_Gene.26394",
                                                     "Silat_chr2_Gene.30622",
                                                     "Silat_chr3_Gene.37746",
                                                     "Silat_chr9_Gene.46125"))
best_leaves = filter(over_leaves, gene_name %in% c("Silat_chr10_Gene.49028",
                                                   "Silat_chr12_Gene.2070",
                                                   "Silat_chr9_Gene.46125"))

# get annotations
rddm_bed = read.table("results/rddm/rddm.bed")
colnames(rddm_bed) = c("chr_cluster", "start_cluster", "end_cluster", "cluster")

long_genes_bed = read.table("annotation/genes.bed")[,1:4] # I firts loaded the +2400bp but it is acctually not what we want
colnames(long_genes_bed) = c("chr_gene", "start_gene", "end_gene", "gene_name")

# merge annotations
best_flowers = left_join(best_flowers, long_genes_bed, by="gene_name") %>% left_join(rddm_bed, by="cluster")
best_leaves = left_join(best_leaves, long_genes_bed, by="gene_name") %>% left_join(rddm_bed, by="cluster")

# operations with coordinates
best_flowers = best_flowers %>% mutate(position =  ifelse(start_gene > end_cluster, "upstream", 
                                                          ifelse(start_cluster > end_gene, "downstream", 
                                                                 ifelse((start_cluster > start_gene) & (end_cluster < end_gene), "gene body", "NA")))) %>% 
  mutate(distance = ifelse(position=="upstream", (start_gene - start_cluster)/1000, 
                                          ifelse(position=="downstream", (start_cluster - end_gene)/1000, 0)))

best_leaves = best_leaves %>% mutate(position =  ifelse(start_gene > end_cluster, "upstream", 
                                                          ifelse(start_cluster > end_gene, "downstream", 
                                                                 ifelse((start_cluster > start_gene) & (end_cluster < end_gene), "gene body", "NA")))) %>% 
  mutate(distance = ifelse(position=="upstream", (start_gene - start_cluster)/1000, 
                                          ifelse(position=="downstream", (start_cluster - end_gene)/1000, 0)))

# plot

rbind(mutate(best_flowers, tissue="Flower buds"), mutate(best_leaves, tissue="Leaves")) %>% ggplot() +
  facet_grid(~tissue, scales = "free_y") +
  geom_point(aes(y=gene_name, x=distance, color=position), size=6) +
  geom_segment(aes(y=gene_name, yend=gene_name, x=0, xend=distance, color=position), size= 2) +
  labs(x="Distance to gene (Kbp)", y=NULL, color="Position of the sRNA cluster") +
  scale_color_manual(values = rev(met.brewer("Juarez"))) +
  theme_bw() +
  theme(text = element_text(size = 20),
        panel.border = element_rect(color = "black", fill=NA),
        panel.spacing = unit(1, "lines"),
        aspect.ratio=1)

ggsave("results/rddm_distances.png", width = 12, height = 8, dpi = 600)




# TEs surrounding the transcription factor
read.table("results/rddm/tes_around_genes.tsv")[,-c(8,10,11)] %>% mutate(up = V2-V6, down = V6-V3, length = V7-V6) # some stats
##       V1       V2       V3                     V4    V5       V6       V7
## 1   chr3  1839790  1844111  Silat_chr3_Gene.37746  chr3  1837703  1837859
## 2   chr3  1839790  1844111  Silat_chr3_Gene.37746  chr3  1837860  1837885
## 3   chr3  1839790  1844111  Silat_chr3_Gene.37746  chr3  1838213  1838685
## 4   chr3  1839790  1844111  Silat_chr3_Gene.37746  chr3  1838729  1838781
## 5   chr3  1839790  1844111  Silat_chr3_Gene.37746  chr3  1838730  1838791
## 6   chr3  1839790  1844111  Silat_chr3_Gene.37746  chr3  1843979  1844190
## 7   chr3  1839790  1844111  Silat_chr3_Gene.37746  chr3  1844139  1844197
## 8   chr3  1839790  1844111  Silat_chr3_Gene.37746  chr3  1844970  1845139
## 9   chr3  1839790  1844111  Silat_chr3_Gene.37746  chr3  1844990  1845017
## 10  chr3  1839790  1844111  Silat_chr3_Gene.37746  chr3  1845018  1845168
## 11  chr3  1839790  1844111  Silat_chr3_Gene.37746  chr3  1846017  1846043
## 12  chr3  1839790  1844111  Silat_chr3_Gene.37746  chr3  1846038  1846099
## 13  chr3  1839790  1844111  Silat_chr3_Gene.37746  chr3  1846044  1846113
## 14  chr3  1839790  1844111  Silat_chr3_Gene.37746  chr3  1846070  1846132
## 15 chr12 18331848 18335199 Silat_chr12_Gene.14608 chr12 18335464 18335560
##                                 V9    up  down length
## 1          Motif:rnd-5_family-2244  2087 -6408    156
## 2       Motif:RLC_SIRE_76712_FL_DL  1930 -6251     25
## 3  Motif:RLG_Tekay_54006_partial_D  1577 -5898    472
## 4           Motif:rnd-1_family-420  1061 -5382     52
## 5           Motif:rnd-1_family-420  1060 -5381     61
## 6           Motif:rnd-1_family-528 -4189  -132    211
## 7   Motif:RLG_Retand_14683_FL_DLTP -4349    28     58
## 8     Motif:RLG_Retand_90523_FL_DL -5180   859    169
## 9      Motif:RLG_Tekay_57285_FL_DL -5200   879     27
## 10           Motif:rnd-4_family-47 -5228   907    150
## 11      Motif:RLC_SIRE_90603_FL_DL -6227  1906     26
## 12  Motif:RLG_Athila_31436_FL_DLTP -6248  1927     61
## 13       Motif:RLG_CRM_70870_FL_DL -6254  1933     69
## 14     Motif:RLG_Reina_85746_FL_DL -6280  1959     62
## 15    Motif:RLG_Tekay_29122_FL_DLP -3616   265     96
# plot together the gene, sRNA and TEs
tf_vecinos = as.data.frame(mapply(c,
filter(rddm_bed, cluster=="Cluster_10374_RdDM") %>% mutate("sRNA cluster"),
read.table("results/rddm/clusters_around_genes.tsv")[, 5:8] %>% mutate("sRNA cluster"),
read.table("results/rddm/tes_around_genes.tsv")[1, 1:4] %>% mutate("Gene"),
read.table("results/rddm/tes_around_genes.tsv")[-15, c(5,6,7,9)] %>% mutate(V9 = paste(V9, 1:14, sep = "_"), "TEs")
)) 


colnames(tf_vecinos) = c("chr", "start", "end", "name", "element")
tf_vecinos$element = factor(tf_vecinos$element, levels = rev(c("Gene", "sRNA cluster", "TEs")))

ggplot(tf_vecinos, aes(xmin = as.numeric(start), xmax = as.numeric(end), y = element, fill = element)) +
  geom_gene_arrow() +
  labs(x="Chromosome 3", y=NULL) +
  scale_fill_manual(values = rev(met.brewer("Egypt", 3))) +
  scale_x_continuous(limits = c(1837000, 1846000)) +
  theme_light() +
  theme(text = element_text(size = 20),
        panel.border = element_rect(color = "black", fill=NA),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1, size = 10))
## Warning: Removed 4 rows containing missing values (`geom_gene_arrow()`).

# THE LENGHTS IN THE PLOT WILL BE WRONG IF THEY ARE NOT CONTINOUS

ggsave("results/tf_vecinity.png", width = 8, height = 3, dpi = 600)
## Warning: Removed 4 rows containing missing values (`geom_gene_arrow()`).
rm(rddm_bed, long_genes_bed, best_flowers, best_leaves, tf_vecinos)
# We calculate depths for each sex-biased gene in Flowers
# NOTE: genes without sRNA mappping will NOT appear here

# Females
flo_rddm_fe = fread("results/rddm/flowers_depth_females.tsv")
flo_rddm_fe = surco_gene(flo_rddm_fe, 54935002, 29689313, 25245689)
colnames(flo_rddm_fe) = c("gene", "log_female_exp", "female_exp")

# Males
flo_rddm_ma = fread("results/rddm/flowers_depth_males.tsv")
flo_rddm_ma = surco_gene(flo_rddm_ma, 13224600, 23943681, 23582102)
colnames(flo_rddm_ma) = c("gene", "log_male_exp", "male_exp")

# marge and fill
flo_rddm = full_join(flo_rddm_fe, flo_rddm_ma, by="gene")

# plot
ggplot(flo_rddm) +
  annotate(geom = "polygon", x = c(-Inf, Inf, Inf), y = c(-Inf, Inf, -Inf), fill = "gray8", alpha = 0.2 ) +
  geom_abline(intercept = 0, slope = 1, alpha=0.6, color="red2", size=1, linetype="dashed") +
  geom_jitter(aes(log_female_exp, log_male_exp), color=met.brewer("Derain", 1), size=4) +
  scale_x_continuous(limits = c(-5, 4)) +
  scale_y_continuous(limits = c(-5, 4)) +
  labs(x= "♀ log2RPM sRNA", y="♂ log2RPM sRNA") +
  scale_fill_manual(values = met.brewer("Derain")) +
  theme_bw() +
  theme(legend.position = "none",
        text = element_text(size = 20),
        panel.border = element_rect(color = "black", fill=NA),
        aspect.ratio=1) 
## Warning: Removed 96 rows containing missing values (`geom_point()`).

ggsave("results/gene-level_mapping_rddm.png", width = 8, height = 6, dpi = 600)
## Warning: Removed 96 rows containing missing values (`geom_point()`).
# select best candidates

flo_rddm_best = filter(flo_rddm, gene %in% c("Silat_chr1_Gene.4771",
                                                     "Silat_chr12_Gene.14608",
                                                     "Silat_chr12_Gene.26394",
                                                     "Silat_chr2_Gene.30622",
                                                     "Silat_chr3_Gene.37746",
                                                     "Silat_chr9_Gene.46125",
                                                     "Silat_chr10_Gene.49028",
                                                     "Silat_chr12_Gene.2070"))

flo_rddm_best = melt(flo_rddm_best, id.vars = "gene", measure.vars = c("log_female_exp", "log_male_exp"))

ggplot(flo_rddm_best, aes(x=variable)) +
  facet_wrap(~gene, nrow = 4, ncol=3) +
  geom_point(aes(y= value, color = gene), size=4) +
  geom_line(aes(y= value, group = gene, color=gene), size=2) +
  scale_color_manual(values = met.brewer("Egypt", 8)) +
  labs(x=NULL, y="Mean gene-level sRNA mapping", color=NULL) +
  scale_x_discrete(labels=c("log_female_exp" = "Female", "log_male_exp" = "Male")) +
  theme(legend.position = "none",
        text = element_text(size = 20),
        panel.border = element_rect(color = "black", fill=NA),
        aspect.ratio=1) 

ggsave("results/gene-level_rddm_best.png", width = 16, height = 10, dpi = 600)
# new function that does not get averages
surco_best = function(toy, lib_size_1, lib_size_2, lib_size_3) {
          # set column names
          colnames(toy) = c("chr", "start", "end", "depth_1", "depth_2", "depth_3", "gene")
          # normalize by library size
          toy[, mean_depth_1 := depth_1/lib_size_1, by=1:nrow(toy)]
          toy[, mean_depth_2 := depth_2/lib_size_2, by=1:nrow(toy)]
          toy[, mean_depth_3 := depth_3/lib_size_3, by=1:nrow(toy)]
          # overall mean and multiply by 1m
          toy[, mean_depth := (mean_depth_1 + mean_depth_2 + mean_depth_3)*(1000000/3), by=1:nrow(toy)]
          # print bed file + mean depth per million
          toy[, .(gene, chr, start, mean_depth)]
}

# Females
flo_rddm_fe = fread("results/rddm/flowers_depth_females.tsv")
tf_fe = surco_best(flo_rddm_fe, 54935002, 29689313, 25245689)
colnames(tf_fe) = c("gene", "chr",  "start", "Females")
tf_fe = filter(tf_fe, gene=="Silat_chr3_Gene.37746")

# Males
flo_rddm_ma = fread("results/rddm/flowers_depth_males.tsv")
tf_ma = surco_best(flo_rddm_ma, 13224600, 23943681, 23582102)
colnames(tf_ma) = c("gene", "chr",  "start", "Males")
tf_ma = filter(tf_ma, gene=="Silat_chr3_Gene.37746")

# merge and reshape
full_join(tf_fe, tf_ma, by="start")[,c("start", "Females", "Males")] %>% melt(id.vars = "start", measure.vars = c("Females", "Males"), variable.name = "sex", value.name = "mapping") %>% ggplot()+
  geom_col(aes(x=as.numeric(start), y=mapping, fill=sex), stat = 'identity') +
  scale_y_continuous(limits = c(0,3)) +
  scale_x_continuous(limits = c(1837000, 1846000)) +
  scale_fill_manual(values = met.brewer("Derain")) +
  labs(x=NULL, y="RPM", fill="Sex") +
  theme_bw()+
  theme(text = element_text(size=20),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1, size = 10))
## Warning in geom_col(aes(x = as.numeric(start), y = mapping, fill = sex), :
## Ignoring unknown parameters: `stat`
## Warning: Removed 948 rows containing missing values (`position_stack()`).
## Warning: Removed 2 rows containing missing values (`geom_col()`).

ggsave("results/tf_mapping.png", width = 8, height = 2, dpi = 600)
## Warning: Removed 948 rows containing missing values (`position_stack()`).
## Removed 2 rows containing missing values (`geom_col()`).
rm(surco_best, flo_rddm_fe, flo_rddm_ma, tf_fe, tf_ma, flo_rddm, flo_rddm_best)

6.- Gene Co-expression network and GLMM

# get the union of DE genes (union works in pairs)
de_genes = union(flowers_genes[["clusters"]], leaves_genes[["clusters"]])
de_genes = as.matrix(cpm(kall, log = T)[de_genes, ])

# Only Flowers
{
de_genes = flowers_genes[["clusters"]]
de_genes = cpm(kall, log = T)[de_genes, ]
de_genes = as.data.frame(de_genes) %>% select(grep('B', colnames(de_genes)))
de_genes = as.matrix(de_genes)
}

# Only Leaves
{
de_genes = leaves_genes[["clusters"]]
de_genes = cpm(kall, log = T)[de_genes, ]
de_genes = as.data.frame(de_genes) %>% select(grep('L', colnames(de_genes)))
de_genes = as.matrix(de_genes)
}

# remove TE genes
#!(as.vector(unique(read.table("tesorter_genes/results.tsv")[,2])) %in% rownames(de_genes) ) 
!(as.vector(rownames(de_genes) %in% unique(read.table("tesorter_genes/results.tsv")[,2]))) 

#de_genes = de_genes[!(as.vector(unique(read.table("tesorter_genes/results.tsv")[,2])) %in% rownames(de_genes) ),]
de_genes = de_genes[!(as.vector(rownames(de_genes) %in% unique(read.table("tesorter_genes/results.tsv")[,2]))),]

# NOTE: within a total union 6678 DE genes, only 3705 are not TE-genes (55%)
# Between the sex-biased, only 243 are TE-genes

# get z-score
de_genes = t(apply(de_genes, 1, function(row) {
  (row - mean(row)) / sd(row)
}))

# Calculate correlations
weight_mat = GENIE3(de_genes, nCores=16, verbose=TRUE)
dim(weight_mat)
link_list = getLinkList(weight_mat, threshold=0.005) # report all 

# Check modules
my_network = graph_from_data_frame(link_list,  directed = F)
modules = cluster_leiden(my_network, resolution_parameter = 0.001, objective_function = "modularity") # 1 cluster in flowers+leaves and only leaves vs 54 clusters only in flowers

# get SB gene annotation, use those from the modules to select
#network_annotation = bait_df %>% select(gene, male_biased) %>% distinct(gene, .keep_all = TRUE) %>% filter(gene %in% modules$names) %>% mutate(bait = gene == "Silat_chr12_Gene.1440") # PTGS bait from flowers
network_annotation = bait_df %>% select(gene, male_biased) %>% distinct(gene, .keep_all = TRUE) %>% filter(gene %in% modules$names) %>% mutate(bait = gene == "Silat_chr3_Gene.37746") # RdDM bait from flowers

# get number of connections (edges) per node
edges_network = data.frame(gene = as.factor(V(my_network)),
                           no_edges =   igraph::degree(my_network, mode = "all") )
edges_network$gene = rownames(edges_network)
# merge annotations
network_annotation = left_join(network_annotation, edges_network, by="gene")
# re-anotate network
my_network = graph_from_data_frame(link_list,  directed = F, vertices = network_annotation)

# PLOT
ggraph(my_network, layout = "kk", circular = F) +
  geom_edge_diagonal(color = "grey70", width = 0.5, alpha = 0.8) +
  geom_node_point(aes(fill = male_biased, color=bait, shape=bait, size=no_edges), alpha = 0.8) +
  scale_fill_manual(values = met.brewer("Derain")) +
  scale_shape_manual(values = c(21, 19)) +
  scale_color_manual(values = c("black", "blue")) +
  theme_void() +
  theme(
    text = element_text(size = 20), 
    legend.position = "none"
  )

#ggsave("results/network.png", width = 8, height = 8, dpi = 600)
#ggsave("results/network_flowers.png", width = 8, height = 8, dpi = 600)
#ggsave("results/network_leaves.png", width = 8, height = 8, dpi = 600)

rm(de_genes, weight_mat, link_list, my_network, network_annotation, edges_network)

columns:

gene name bias average exp (tpm) 21-22 sRNA mapping (rpm) 24 sRNA mapping (rpm) sex tissue + comparison

library sizes: flowers females: 54935002, 29689313, 25245689 flowers males: 13224600, 23943681, 23582102 leaves females: 24695806, 31425418, 21443081 leaves males: 20502566, 17763390, 26371650

# get info of sex-biased genes in flowers
bias_flowers = read.table("results/differential_expression/sex_biased_flowers_sbge.bed")[, c(4,8)] %>% rename(V4="gene", V8="bias")
bias_leaves = read.table("results/differential_expression/sex_biased_leaves_sbge.bed")[, c(4,8)] %>% rename(V4="gene", V8="bias") 

# get raw expression data (reload again just in case)
fermented_genes = transpose(tpm)
colnames(fermented_genes) = rownames(tpm)
rownames(fermented_genes) = colnames(tpm)
fermented_genes = as.data.frame(fermented_genes) %>% mutate(sample = rownames(fermented_genes)) %>% relocate(sample) 
datainfo_s = kall_info %>% mutate(sample = rownames(kall_info)) %>% relocate(sample) %>% rename(organ="tissue")
fermented_genes = right_join(datainfo_s, fermented_genes, by = "sample")

# get genes and average replicates, add bias info
# do flowers and leaves indpendently, otherwise it's a mess to merge bias info (a gene may be differently biased in flowers or leaves)
# I selected the tissue where sex-biased was measured to avoid mixing up unrelated stuff (sbge in flowers == only flower samples)

exp_flowers = subset(fermented_genes, select = c("sample", "sex", "tissue", levels(as.factor(bias_flowers$gene)))) %>% melt(id.vars = c("sample", "sex", "tissue"), variable.name="gene") %>% group_by(sex, tissue, gene) %>% summarise(gene, mean_exp = mean(value), sex, tissue) %>% ungroup() %>% unique() %>% left_join(bias_flowers, by="gene") %>% filter(tissue == "Flower bud")
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'sex', 'tissue', 'gene'. You can override
## using the `.groups` argument.
exp_leaves = subset(fermented_genes, select = c("sample", "sex", "tissue", levels(as.factor(bias_leaves$gene)))) %>% melt(id.vars = c("sample", "sex", "tissue"), variable.name="gene") %>% group_by(sex, tissue, gene) %>% summarise(gene, mean_exp = mean(value), sex, tissue) %>% ungroup() %>% unique() %>% left_join(bias_leaves, by="gene")  %>% filter(tissue == "Leaf")
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'sex', 'tissue', 'gene'. You can override
## using the `.groups` argument.
rm(fermented_genes, datainfo_s)
####################################################################

# sRNA data

############ Flowers

#### 21/22 - PTGS 

# Females
flo_ptgs_fe = fread("results/gene-level/flowers_depth_females.tsv")
flo_ptgs_fe = surco_gene(flo_ptgs_fe, 54935002, 29689313, 25245689)
flo_ptgs_fe = rbind(
flo_ptgs_fe = flo_ptgs_fe %>% select(gene, mean_depth),
data.frame(gene = setdiff(bias_flowers$gene, flo_ptgs_fe$gene), mean_depth = 0)
) %>% mutate(sex = "Female", tissue = "Flower bud")

# Males
flo_ptgs_ma = fread("results/gene-level/flowers_depth_males.tsv")
flo_ptgs_ma = surco_gene(flo_ptgs_ma, 13224600, 23943681, 23582102)
flo_ptgs_ma = rbind(
flo_ptgs_ma = flo_ptgs_ma %>% select(gene, mean_depth),
data.frame(gene = setdiff(bias_flowers$gene, flo_ptgs_ma$gene), mean_depth = 0)
) %>% mutate(sex = "Male", tissue = "Flower bud")

# concatenate
flo_ptgs = rbind(flo_ptgs_fe, flo_ptgs_ma) %>% rename(mean_depth = "ptgs")
rm(flo_ptgs_fe, flo_ptgs_ma) 

#### 24 - RdDM

# Females
flo_rddm_fe = fread("results/rddm/flowers_depth_females.tsv")
flo_rddm_fe = surco_gene(flo_rddm_fe, 54935002, 29689313, 25245689)
flo_rddm_fe = rbind(
flo_rddm_fe = flo_rddm_fe %>% select(gene, mean_depth),
data.frame(gene = setdiff(bias_flowers$gene, flo_rddm_fe$gene), mean_depth = 0)
) %>% mutate(sex = "Female", tissue = "Flower bud")

# Males
flo_rddm_ma = fread("results/rddm/flowers_depth_males.tsv")
flo_rddm_ma = surco_gene(flo_rddm_ma, 13224600, 23943681, 23582102)
flo_rddm_ma = flo_rddm_ma %>% select(gene, mean_depth) %>% mutate(sex = "Male", tissue = "Flower bud") # all genes had depth info

# concatenate
flo_rddm = rbind(flo_rddm_fe, flo_rddm_ma) %>% rename(mean_depth = "rddm")
rm(flo_rddm_fe, flo_rddm_ma) 

#### merge

exp_flowers = left_join(exp_flowers, flo_ptgs, by=c("gene", "sex", "tissue")) %>% left_join(flo_rddm, by=c("gene", "sex", "tissue")) %>% mutate(comparison = "Sex-biased in flowers")
rm(flo_ptgs, flo_rddm)














############ Leaves

#### 21/22 - PTGS 

# Females
le_ptgs_fe = fread("results/gene-level/leaves_depth_females.tsv")
le_ptgs_fe = surco_gene(le_ptgs_fe, 24695806, 31425418, 21443081)
le_ptgs_fe = rbind(
le_ptgs_fe = le_ptgs_fe %>% select(gene, mean_depth),
data.frame(gene = setdiff(bias_leaves$gene, le_ptgs_fe$gene), mean_depth = 0)
) %>% mutate(sex = "Female", tissue = "Leaf")

# Males
le_ptgs_ma = fread("results/gene-level/leaves_depth_males.tsv")
le_ptgs_ma = surco_gene(le_ptgs_ma, 20502566, 17763390, 26371650)
le_ptgs_ma = rbind(
le_ptgs_ma = le_ptgs_ma %>% select(gene, mean_depth),
data.frame(gene = setdiff(bias_leaves$gene, le_ptgs_ma$gene), mean_depth = 0)
) %>% mutate(sex = "Male", tissue = "Leaf")

# concatenate
le_ptgs = rbind(le_ptgs_fe, le_ptgs_ma) %>% rename(mean_depth = "ptgs")
rm(le_ptgs_fe, le_ptgs_ma) 

#### 24 - RdDM 

# Females
le_rddm_fe = fread("results/rddm/leaves_depth_females.tsv")
le_rddm_fe = surco_gene(le_rddm_fe, 24695806, 31425418, 21443081)
le_rddm_fe = rbind(
le_rddm_fe = le_rddm_fe %>% select(gene, mean_depth),
data.frame(gene = setdiff(bias_leaves$gene, le_rddm_fe$gene), mean_depth = 0)
) %>% mutate(sex = "Female", tissue = "Leaf")

# Males
le_rddm_ma = fread("results/gene-level/leaves_depth_males.tsv")
le_rddm_ma = surco_gene(le_rddm_ma, 20502566, 17763390, 26371650)
le_rddm_ma = rbind(
le_rddm_ma = le_rddm_ma %>% select(gene, mean_depth),
data.frame(gene = setdiff(bias_leaves$gene, le_rddm_ma$gene), mean_depth = 0)
) %>% mutate(sex = "Male", tissue = "Leaf")

# concatenate
le_rddm = rbind(le_rddm_fe, le_rddm_ma) %>% rename(mean_depth = "rddm")
rm(le_rddm_fe, le_rddm_ma) 

#### merge

exp_leaves = left_join(exp_leaves, le_ptgs, by=c("gene", "sex", "tissue")) %>% left_join(le_rddm, by=c("gene", "sex", "tissue")) %>% mutate(comparison = "Sex-biased in leaves")
rm(le_ptgs, le_rddm)




########################

# Merge leaves and flowers
exp_all = rbind(exp_flowers, exp_leaves)
rm(exp_flowers, exp_leaves)

# Save the table
write.table(exp_all, file = "results/sbge_ptgs_rddm.tsv", quote = F, col.names = T, row.names = F, sep = "\t")




# plot correlation ptgs vs rddm
ggplot(exp_all) +
  geom_jitter(aes(log2(ptgs), log2(rddm)), color="black", size=1) +
  labs(x= "21/22-nt sRNA log2RPM", y="24-nt sRNA log2RPM") +
  theme_bw() +
  theme(legend.position = "none",
        text = element_text(size = 20),
        panel.border = element_rect(color = "black", fill=NA),
        aspect.ratio=1) 

ggsave("results/ptgs_vs_rddm.png", width = 6, height = 4, dpi = 600)

# plot correlation ptgs vs sbge
ggplot(exp_all) +
  geom_jitter(aes(log2(mean_exp), log2(ptgs)), color="black", size=1) +
  facet_wrap(~comparison, ncol = 2, scales = "free_x") +
  labs(x= "mRNA log2TPM", y="21/22-nt sRNA log2RPM") +
  theme_bw() +
  theme(legend.position = "none",
        text = element_text(size = 20),
        panel.border = element_rect(color = "black", fill=NA),
        axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
        aspect.ratio=1) 

ggsave("results/ptgs_vs_sbge.png", width = 8, height = 4, dpi = 600)

# plot correlation rddm vs sbge
ggplot(exp_all) +
  geom_jitter(aes(log2(mean_exp), log2(rddm)), color="black", size=1) +
  facet_wrap(~comparison, ncol = 2, scales = "free_x") +
  labs(x= "mRNA log2TPM", y="24-nt sRNA log2RPM") +
  theme_bw() +
  theme(legend.position = "none",
        text = element_text(size = 20),
        panel.border = element_rect(color = "black", fill=NA),
        axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
        aspect.ratio=1) 

ggsave("results/rddm_vs_sbge.png", width = 8, height = 4, dpi = 600)

# Plot comparison Female vs Male sRNA mapping PTGS

ggplot(exp_all) +  
  geom_violin(aes(x=bias, y=log2(ptgs), fill=sex), color="white") +
  geom_boxplot(aes(x=bias, y=log2(ptgs), fill=sex), color="gray4", alpha=0.2, notch = T, position = position_dodge(width=0.9)) +
  facet_wrap(~tissue) +
  labs(x="Gene bias", y="21/22-nt sRNA log2RPM", fill="Sex expression") +
  scale_fill_manual(values = met.brewer("Derain")) +
  theme_bw() +
  theme(legend.position = "top",
        text = element_text(size = 20),
        panel.border = element_rect(color = "black", fill=NA),
        aspect.ratio=1) 
## Warning: Removed 215 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 215 rows containing non-finite values (`stat_boxplot()`).
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?

ggsave("results/ptgs_vs_sbge_sex.png", width = 10, height = 6, dpi = 600)
## Warning: Removed 215 rows containing non-finite values (`stat_ydensity()`).
## Removed 215 rows containing non-finite values (`stat_boxplot()`).
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?
# Plot comparison Female vs Male sRNA mapping RdDM

ggplot(exp_all) +  
  geom_violin(aes(x=bias, y=log2(rddm), fill=sex), color="white") +
  geom_boxplot(aes(x=bias, y=log2(rddm), fill=sex), color="gray4", alpha=0.2, notch = T, position = position_dodge(width=0.9)) +
  facet_wrap(~tissue) +
  labs(x="Gene bias", y="24-nt sRNA log2RPM", fill="Sex expression") +
  scale_fill_manual(values = met.brewer("Derain")) +
  theme_bw() +
  theme(legend.position = "top",
        text = element_text(size = 20),
        panel.border = element_rect(color = "black", fill=NA),
        aspect.ratio=1) 
## Warning: Removed 80 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 80 rows containing non-finite values (`stat_boxplot()`).

ggsave("results/rddm_vs_sbge_sex.png", width = 10, height = 6, dpi = 600)
## Warning: Removed 80 rows containing non-finite values (`stat_ydensity()`).
## Removed 80 rows containing non-finite values (`stat_boxplot()`).
rm(exp_all)

7.- Population-level diversity

# Flowers
# none of them are in the Y chr
pi_flo = drop_na(read.table("popgen/pi_flower_sbg_pi.txt", header = T) )[, 2:5]
colnames(pi_flo) = c("chr", "start", "end", "pi")

bias_flowers = read.table("results/differential_expression/sex_biased_flowers_sbge.bed")[, c(1:4,8)]
colnames(bias_flowers) = c("chr", "start", "end", "gene", "bias")

pi_flo = left_join(pi_flo, bias_flowers, by=c("chr", "start", "end"))


# load pi values from unibased genes
pi_unbiased = drop_na(read.table("popgen/pi_unbiased_sbg_pi.txt", header = T) )[, 2:5]
colnames(pi_unbiased) = c("chr", "start", "end", "pi")
pi_unbiased = pi_unbiased %>% mutate(gene = paste("un", 1:nrow(pi_unbiased), sep = "_"), bias = "unbiased")



# stats
shapiro.test(filter(pi_flo, bias=="male-biased")$pi) # not normally distributed
## 
##  Shapiro-Wilk normality test
## 
## data:  filter(pi_flo, bias == "male-biased")$pi
## W = 0.80766, p-value = 3.573e-12
shapiro.test(filter(pi_flo, bias=="female-biased")$pi) # not normally distributed
## 
##  Shapiro-Wilk normality test
## 
## data:  filter(pi_flo, bias == "female-biased")$pi
## W = 0.80349, p-value = 1.975e-07
shapiro.test(pi_unbiased$pi) # not normally distributed
## 
##  Shapiro-Wilk normality test
## 
## data:  pi_unbiased$pi
## W = 0.82899, p-value < 2.2e-16
var.test(x=filter(pi_flo, bias=="male-biased")$pi, y=filter(pi_flo, bias=="female-biased")$pi)
## 
##  F test to compare two variances
## 
## data:  filter(pi_flo, bias == "male-biased")$pi and filter(pi_flo, bias == "female-biased")$pi
## F = 1.4474, num df = 137, denom df = 58, p-value = 0.1117
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.9162821 2.1986361
## sample estimates:
## ratio of variances 
##           1.447436
# there is no significant difference between the variances 

t.test(x=filter(pi_flo, bias=="male-biased")$pi, y=filter(pi_flo, bias=="female-biased")$pi, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  filter(pi_flo, bias == "male-biased")$pi and filter(pi_flo, bias == "female-biased")$pi
## t = 0.97987, df = 195, p-value = 0.3284
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.009452139  0.028118916
## sample estimates:
## mean of x mean of y 
## 0.1604355 0.1511021
t.test(pi ~ bias, data = pi_flo, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  pi by bias
## t = -0.97987, df = 195, p-value = 0.3284
## alternative hypothesis: true difference in means between group female-biased and group male-biased is not equal to 0
## 95 percent confidence interval:
##  -0.028118916  0.009452139
## sample estimates:
## mean in group female-biased   mean in group male-biased 
##                   0.1511021                   0.1604355
# p values not significant (0.32), no difference

wilcox.test(pi ~ bias, data=pi_flo) 
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  pi by bias
## W = 3790.5, p-value = 0.4449
## alternative hypothesis: true location shift is not equal to 0
# p values not significant (0.44), no difference

# concatenate
pi_flo = rbind(pi_flo, pi_unbiased)

# comparison to unbiased
var.test(x=filter(pi_flo, bias=="male-biased")$pi, y=pi_unbiased$pi)
## 
##  F test to compare two variances
## 
## data:  filter(pi_flo, bias == "male-biased")$pi and pi_unbiased$pi
## F = 0.76258, num df = 137, denom df = 1407, p-value = 0.04257
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.6022454 0.9907896
## sample estimates:
## ratio of variances 
##          0.7625785
var.test(x=filter(pi_flo, bias=="female-biased")$pi, y=pi_unbiased$pi)
## 
##  F test to compare two variances
## 
## data:  filter(pi_flo, bias == "female-biased")$pi and pi_unbiased$pi
## F = 0.52685, num df = 58, denom df = 1407, p-value = 0.00257
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.3742442 0.7914290
## sample estimates:
## ratio of variances 
##          0.5268477
# there is A significant difference between the variances (p=0.4 for males and 0.002 for females)

wilcox.test(x=filter(pi_flo, bias=="male-biased")$pi, y=pi_unbiased$pi, var.equal = F)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  filter(pi_flo, bias == "male-biased")$pi and pi_unbiased$pi
## W = 91144, p-value = 0.23
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(x=filter(pi_flo, bias=="female-biased")$pi, y=pi_unbiased$pi, var.equal = F)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  filter(pi_flo, bias == "female-biased")$pi and pi_unbiased$pi
## W = 36245, p-value = 0.097
## alternative hypothesis: true location shift is not equal to 0
# p values not significant (males p=0.23, females p=0.09)

# plot all
ggplot(pi_flo) +
  geom_jitter(aes(x=bias, y=pi, color=bias), alpha=0.4) +
  geom_violin(aes(x=bias, y=pi, fill=bias), color="white", alpha=0.6) +
  geom_boxplot(aes(x=bias, y=pi, fill=bias), color="gray4", alpha=0.2, notch = T) +
  labs(x="Gene bias", y="Nucleotide diversity (Ï€)") +
  scale_fill_manual(values = met.brewer("Derain")[c(1,2,4)]) +
  scale_color_manual(values = met.brewer("Derain")[c(1,2,4)]) +
  theme_bw() +
  theme(legend.position = "none",
        text = element_text(size = 20),
        panel.border = element_rect(color = "black", fill=NA),
        aspect.ratio=1) 

ggsave("results/pi.png", width = 8, height = 6, dpi = 600)

# Leaves

pi_le = drop_na(read.table("popgen/pi_leaves_sbg_pi.txt", header = T) )[, 2:5]
colnames(pi_le) = c("chr", "start", "end", "pi")
# only gene Silat_chr4_Gene.22085 is not SB in flowers too, pi=0.16 (around the mean)
bias_leaves = read.table("results/differential_expression/sex_biased_leaves_sbge.bed")[, c(1:4,8)]
colnames(bias_leaves) = c("chr", "start", "end", "gene", "bias")
pi_le = left_join(pi_le, bias_leaves, by=c("chr", "start", "end"))

rm(pi_flo, pi_le, pi_unbiased, bias_flowers, bias_leaves)